##lOADING ALL THE PACKAGES
rm(list=ls(all=TRUE))
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(MASS)
library(vegan)
## Loading required package: permute
## This is vegan 2.5-7
##
## Attaching package: 'vegan'
## The following object is masked from 'package:caret':
##
## tolerance
library(data.table)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(DMwR)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(e1071)
library(standardize)
##
## ***********************************************************
## Loading standardize package version 0.2.2
## Call standardize.news() to see new features/changes
## ***********************************************************
library(ggplot2)
library(arules)
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
setwd("~/Linear and Non Linear/Data")
one=read.csv("Train.csv",na.strings=c("","NA"))
two=read.csv("Train_Claim.csv",na.strings=c("","NA"))
three=read.csv("Train_Demographics.csv",na.strings=c("","NA"))
four=read.csv("Train_Policy.csv",na.strings=c("","NA"))
five=read.csv("Train_Vehicle.csv")
#Converting all the missing values to NA
#for Data of VEhicle
five$VehicleAttributeDetails=as.factor(ifelse(five$VehicleAttributeDetails=="???",NA,five$VehicleAttributeDetails))
#for Claim INformation
two$TypeOfCollission=as.factor(ifelse(two$TypeOfCollission=="?",NA,two$TypeOfCollission))
two$IncidentTime=as.factor(ifelse(two$IncidentTime=="-5",NA,two$IncidentTime))
two$PropertyDamage=as.factor(ifelse(two$PropertyDamage=="?",NA,two$PropertyDamage))
two$Witnesses=as.factor(ifelse(two$Witnesses=="MISSINGVALUE",NA,two$Witnesses))
two$PoliceReport=as.factor(ifelse(two$PoliceReport=="?",NA,two$PoliceReport))
two$AmountOfTotalClaim=(ifelse(two$AmountOfTotalClaim=="MISSEDDATA",NA,two$AmountOfTotalClaim))
#for policy INFORMATIOn
four$PolicyAnnualPremium=(ifelse(four$PolicyAnnualPremium=="-1",NA,four$PolicyAnnualPremium))
colSums(is.na(five))
## CustomerID VehicleAttribute VehicleAttributeDetails
## 0 0 50
colSums(is.na(two))
## CustomerID DateOfIncident TypeOfIncident
## 0 0 0
## TypeOfCollission SeverityOfIncident AuthoritiesContacted
## 5162 0 0
## IncidentState IncidentCity IncidentAddress
## 0 0 0
## IncidentTime NumberOfVehicles PropertyDamage
## 31 0 10459
## BodilyInjuries Witnesses PoliceReport
## 0 46 9805
## AmountOfTotalClaim AmountOfInjuryClaim AmountOfPropertyClaim
## 50 0 0
## AmountOfVehicleDamage
## 0
colSums(is.na(four))
## InsurancePolicyNumber CustomerLoyaltyPeriod
## 0 0
## DateOfPolicyCoverage InsurancePolicyState
## 0 0
## Policy_CombinedSingleLimit Policy_Deductible
## 0 0
## PolicyAnnualPremium UmbrellaLimit
## 141 0
## InsuredRelationship CustomerID
## 0 0
colSums(is.na(three))
## CustomerID InsuredAge InsuredZipCode
## 0 0 0
## InsuredGender InsuredEducationLevel InsuredOccupation
## 30 0 0
## InsuredHobbies CapitalGains CapitalLoss
## 0 0 0
## Country
## 2
five_one=five[five$VehicleAttribute %in% c("VehicleID"),]
five_one$VehicleAttributeDetails=as.factor(as.character(five_one$VehicleAttributeDetails))
five_one=five_one[,-2]
str(five_one) # we can delete Five_one i.e VEHICLE ID as iT OBVIOUSLY HAS all different factors
## 'data.frame': 28836 obs. of 2 variables:
## $ CustomerID : chr "Cust20179" "Cust13038" "Cust1801" "Cust14947" ...
## $ VehicleAttributeDetails: Factor w/ 28836 levels "Vehicle10000",..: 28005 16377 11360 4017 15386 9527 16914 25689 23654 23148 ...
five_two=five[five$VehicleAttribute %in% c("VehicleMake"),]
five_two$VehicleAttributeDetails2=as.factor(as.character(five_two$VehicleAttributeDetails))
str(five_two)
## 'data.frame': 28836 obs. of 4 variables:
## $ CustomerID : chr "Cust33335" "Cust20624" "Cust9006" "Cust18447" ...
## $ VehicleAttribute : chr "VehicleMake" "VehicleMake" "VehicleMake" "VehicleMake" ...
## $ VehicleAttributeDetails : Factor w/ 28910 levels "1995","1996",..: 69 34 29 47 31 64 66 30 30 69 ...
## $ VehicleAttributeDetails2: Factor w/ 14 levels "Accura","Audi",..: 13 4 1 7 3 11 12 2 2 13 ...
five_two=five_two[,c(-2,-3)]
five_three=five[five$VehicleAttribute %in% c("VehicleModel"),]
five_three$VehicleAttributeDetails3=as.factor(as.character(five_three$VehicleAttributeDetails))
five_three=five_three[,c(-2,-3)]
five_four=five[five$VehicleAttribute %in% c("VehicleYOM"),]
five_four$VehicleAttributeDetails4=as.factor(as.character(five_four$VehicleAttributeDetails))
five_four=five_four[,c(-2,-3)]
str(five_four)
## 'data.frame': 28836 obs. of 2 variables:
## $ CustomerID : chr "Cust21334" "Cust26634" "Cust21432" "Cust22845" ...
## $ VehicleAttributeDetails4: Factor w/ 21 levels "1995","1996",..: 2 5 8 6 9 4 19 8 17 7 ...
#merging all the csv files
merge1 <- merge(five_three,five_two,by="CustomerID")
merge2 <- merge(merge1,five_four,by="CustomerID")
merge3 <- merge(merge2,four,by="CustomerID")
merge4 <- merge(merge3,three,by="CustomerID")
total <- merge(merge4,two,by="CustomerID")
total$InsurancePolicyNumber=as.factor(total$InsurancePolicyNumber)
#deleting customer ID and insurance policy number
total=total[,setdiff(names(total),c("InsurancePolicyNumber","Country","AmountOfPropertyClaim_disc","AmountOfTotalClaim_disc","AmountOfInjuryClaim_disc"))]
total$InsuredZipCode=as.factor(as.integer(total$InsuredZipCode/1000))
str(total)
## 'data.frame': 28836 obs. of 38 variables:
## $ CustomerID : chr "Cust10000" "Cust10001" "Cust10002" "Cust10003" ...
## $ VehicleAttributeDetails3 : Factor w/ 39 levels "3 Series","92x",..: 6 6 21 21 12 12 8 8 29 2 ...
## $ VehicleAttributeDetails2 : Factor w/ 14 levels "Accura","Audi",..: 2 2 14 14 13 13 9 12 14 11 ...
## $ VehicleAttributeDetails4 : Factor w/ 21 levels "1995","1996",..: 14 12 5 9 16 17 6 16 1 10 ...
## $ CustomerLoyaltyPeriod : int 49 114 167 190 115 101 471 340 81 328 ...
## $ DateOfPolicyCoverage : chr "1998-10-25" "2000-11-15" "2001-02-12" "2005-04-11" ...
## $ InsurancePolicyState : chr "State1" "State1" "State3" "State2" ...
## $ Policy_CombinedSingleLimit: chr "100/300" "100/300" "500/1000" "500/1000" ...
## $ Policy_Deductible : int 1000 1000 617 722 500 500 512 877 2000 1000 ...
## $ PolicyAnnualPremium : num 1633 1255 1373 1338 1354 ...
## $ UmbrellaLimit : int 0 0 0 0 4279863 3921366 165819 5282219 0 0 ...
## $ InsuredRelationship : chr "not-in-family" "not-in-family" "wife" "own-child" ...
## $ InsuredAge : int 35 36 33 36 29 28 57 49 27 48 ...
## $ InsuredZipCode : Factor w/ 71 levels "430","431","432",..: 25 25 54 45 28 28 47 47 3 37 ...
## $ InsuredGender : chr "MALE" "MALE" "MALE" "MALE" ...
## $ InsuredEducationLevel : chr "JD" "JD" "JD" "JD" ...
## $ InsuredOccupation : chr "armed-forces" "tech-support" "armed-forces" "armed-forces" ...
## $ InsuredHobbies : chr "movies" "cross-fit" "polo" "polo" ...
## $ CapitalGains : int 56700 70600 66400 47900 0 0 67400 67400 56400 53300 ...
## $ CapitalLoss : int -48500 -48500 -63700 -73400 -41500 -41500 0 0 -32800 0 ...
## $ DateOfIncident : chr "2015-02-03" "2015-02-02" "2015-01-15" "2015-01-19" ...
## $ TypeOfIncident : chr "Multi-vehicle Collision" "Multi-vehicle Collision" "Single Vehicle Collision" "Single Vehicle Collision" ...
## $ TypeOfCollission : Factor w/ 3 levels "Front Collision",..: 3 3 3 3 2 2 1 1 1 3 ...
## $ SeverityOfIncident : chr "Total Loss" "Total Loss" "Minor Damage" "Minor Damage" ...
## $ AuthoritiesContacted : chr "Police" "Police" "Other" "Other" ...
## $ IncidentState : chr "State7" "State7" "State8" "State9" ...
## $ IncidentCity : chr "City1" "City5" "City6" "City6" ...
## $ IncidentAddress : chr "Location 1311" "Location 1311" "Location 2081" "Location 2081" ...
## $ IncidentTime : Factor w/ 24 levels "0","1","2","3",..: 18 11 23 23 11 8 21 19 4 6 ...
## $ NumberOfVehicles : int 3 3 1 1 1 1 1 1 3 1 ...
## $ PropertyDamage : Factor w/ 2 levels "NO","YES": NA 2 2 2 1 1 NA NA 2 2 ...
## $ BodilyInjuries : int 1 2 2 2 2 1 0 0 0 1 ...
## $ Witnesses : Factor w/ 4 levels "0","1","2","3": 1 2 4 4 2 3 3 3 1 3 ...
## $ PoliceReport : Factor w/ 2 levels "NO","YES": NA 2 1 1 2 NA 1 1 NA 2 ...
## $ AmountOfTotalClaim : chr "65501" "61382" "66755" "66243" ...
## $ AmountOfInjuryClaim : int 13417 15560 11630 12003 8829 7818 6476 5738 6788 6510 ...
## $ AmountOfPropertyClaim : int 6071 5919 11630 12003 7234 8132 12822 7333 7504 13020 ...
## $ AmountOfVehicleDamage : int 46013 39903 43495 42237 37481 37217 58155 47498 53584 52080 ...
#trying to fill NA's of insured gender using insured relationship
total$InsuredGender=as.factor(ifelse(total$InsuredRelationship=="wife","2",total$InsuredGender))
total$InsuredGender=as.factor(ifelse(total$InsuredRelationship=="husband","1",total$InsuredGender))
#Central imputation on the enitre dataset
finaldata=centralImputation(total)
###feature engineering
#finding the number of days incident happened before accident
finaldata$DateOfIncident=as.Date(finaldata$DateOfIncident, format = "%Y-%m-%d")
finaldata$DateOfPolicyCoverage=as.Date(finaldata$DateOfPolicyCoverage, format = "%Y-%m-%d")
finaldata$incident_coverage=as.integer(finaldata$DateOfIncident-finaldata$DateOfPolicyCoverage)
finaldata$daysremainingforinsurance=finaldata$InsuredAge*365-finaldata$incident_coverage
final <- merge(one,finaldata,by="CustomerID")
final=final[,-1]
str(final)
## 'data.frame': 28836 obs. of 40 variables:
## $ ReportedFraud : chr "N" "N" "N" "N" ...
## $ VehicleAttributeDetails3 : Factor w/ 39 levels "3 Series","92x",..: 6 6 21 21 12 12 8 8 29 2 ...
## $ VehicleAttributeDetails2 : Factor w/ 14 levels "Accura","Audi",..: 2 2 14 14 13 13 9 12 14 11 ...
## $ VehicleAttributeDetails4 : Factor w/ 21 levels "1995","1996",..: 14 12 5 9 16 17 6 16 1 10 ...
## $ CustomerLoyaltyPeriod : int 49 114 167 190 115 101 471 340 81 328 ...
## $ DateOfPolicyCoverage : Date, format: "1998-10-25" "2000-11-15" ...
## $ InsurancePolicyState : chr "State1" "State1" "State3" "State2" ...
## $ Policy_CombinedSingleLimit: chr "100/300" "100/300" "500/1000" "500/1000" ...
## $ Policy_Deductible : int 1000 1000 617 722 500 500 512 877 2000 1000 ...
## $ PolicyAnnualPremium : num 1633 1255 1373 1338 1354 ...
## $ UmbrellaLimit : int 0 0 0 0 4279863 3921366 165819 5282219 0 0 ...
## $ InsuredRelationship : chr "not-in-family" "not-in-family" "wife" "own-child" ...
## $ InsuredAge : int 35 36 33 36 29 28 57 49 27 48 ...
## $ InsuredZipCode : Factor w/ 71 levels "430","431","432",..: 25 25 54 45 28 28 47 47 3 37 ...
## $ InsuredGender : Factor w/ 3 levels "1","2","3": 3 3 1 3 2 2 3 3 2 1 ...
## $ InsuredEducationLevel : chr "JD" "JD" "JD" "JD" ...
## $ InsuredOccupation : chr "armed-forces" "tech-support" "armed-forces" "armed-forces" ...
## $ InsuredHobbies : chr "movies" "cross-fit" "polo" "polo" ...
## $ CapitalGains : int 56700 70600 66400 47900 0 0 67400 67400 56400 53300 ...
## $ CapitalLoss : int -48500 -48500 -63700 -73400 -41500 -41500 0 0 -32800 0 ...
## $ DateOfIncident : Date, format: "2015-02-03" "2015-02-02" ...
## $ TypeOfIncident : chr "Multi-vehicle Collision" "Multi-vehicle Collision" "Single Vehicle Collision" "Single Vehicle Collision" ...
## $ TypeOfCollission : Factor w/ 3 levels "Front Collision",..: 3 3 3 3 2 2 1 1 1 3 ...
## $ SeverityOfIncident : chr "Total Loss" "Total Loss" "Minor Damage" "Minor Damage" ...
## $ AuthoritiesContacted : chr "Police" "Police" "Other" "Other" ...
## $ IncidentState : chr "State7" "State7" "State8" "State9" ...
## $ IncidentCity : chr "City1" "City5" "City6" "City6" ...
## $ IncidentAddress : chr "Location 1311" "Location 1311" "Location 2081" "Location 2081" ...
## $ IncidentTime : Factor w/ 24 levels "0","1","2","3",..: 18 11 23 23 11 8 21 19 4 6 ...
## $ NumberOfVehicles : int 3 3 1 1 1 1 1 1 3 1 ...
## $ PropertyDamage : Factor w/ 2 levels "NO","YES": 1 2 2 2 1 1 1 1 2 2 ...
## $ BodilyInjuries : int 1 2 2 2 2 1 0 0 0 1 ...
## $ Witnesses : Factor w/ 4 levels "0","1","2","3": 1 2 4 4 2 3 3 3 1 3 ...
## $ PoliceReport : Factor w/ 2 levels "NO","YES": 1 2 1 1 2 1 1 1 1 2 ...
## $ AmountOfTotalClaim : chr "65501" "61382" "66755" "66243" ...
## $ AmountOfInjuryClaim : int 13417 15560 11630 12003 8829 7818 6476 5738 6788 6510 ...
## $ AmountOfPropertyClaim : int 6071 5919 11630 12003 7234 8132 12822 7333 7504 13020 ...
## $ AmountOfVehicleDamage : int 46013 39903 43495 42237 37481 37217 58155 47498 53584 52080 ...
## $ incident_coverage : int 5945 5192 5085 3570 6650 5585 7286 7994 6117 100 ...
## $ daysremainingforinsurance : num 6830 7948 6960 9570 3935 ...
#Distribution of y variable
table(final$ReportedFraud)
##
## N Y
## 21051 7785
###Checking for correlation between the numeric variables
nums <- unlist(lapply(final, is.numeric))
numeric_data = final[,nums]
cor = round(cor(numeric_data),1)
cor[upper.tri(cor)] = " "
cat("Correlation Plot\n")
## Correlation Plot
cor
## CustomerLoyaltyPeriod Policy_Deductible
## CustomerLoyaltyPeriod "1" " "
## Policy_Deductible "0.1" "1"
## PolicyAnnualPremium "0" "0"
## UmbrellaLimit "0" "0"
## InsuredAge "0.9" "0.1"
## CapitalGains "0" "0"
## CapitalLoss "0" "0"
## NumberOfVehicles "0" "0"
## BodilyInjuries "0" "0"
## AmountOfInjuryClaim "0.1" "0"
## AmountOfPropertyClaim "0.1" "0.1"
## AmountOfVehicleDamage "0.1" "0"
## incident_coverage "0.1" "0.1"
## daysremainingforinsurance "0.7" "0"
## PolicyAnnualPremium UmbrellaLimit InsuredAge
## CustomerLoyaltyPeriod " " " " " "
## Policy_Deductible " " " " " "
## PolicyAnnualPremium "1" " " " "
## UmbrellaLimit "0" "1" " "
## InsuredAge "0" "0" "1"
## CapitalGains "0" "0" "0"
## CapitalLoss "0" "0" "0"
## NumberOfVehicles "-0.1" "0" "0"
## BodilyInjuries "0" "0" "0"
## AmountOfInjuryClaim "0" "0" "0.1"
## AmountOfPropertyClaim "0" "0" "0.1"
## AmountOfVehicleDamage "0" "0" "0.1"
## incident_coverage "0" "0" "0.1"
## daysremainingforinsurance "0" "0" "0.8"
## CapitalGains CapitalLoss NumberOfVehicles
## CustomerLoyaltyPeriod " " " " " "
## Policy_Deductible " " " " " "
## PolicyAnnualPremium " " " " " "
## UmbrellaLimit " " " " " "
## InsuredAge " " " " " "
## CapitalGains "1" " " " "
## CapitalLoss "-0.1" "1" " "
## NumberOfVehicles "0.1" "0" "1"
## BodilyInjuries "0.1" "0" "0"
## AmountOfInjuryClaim "0" "0" "0.3"
## AmountOfPropertyClaim "0" "0" "0.2"
## AmountOfVehicleDamage "0" "0" "0.3"
## incident_coverage "0" "0" "0"
## daysremainingforinsurance "0" "0" "0"
## BodilyInjuries AmountOfInjuryClaim
## CustomerLoyaltyPeriod " " " "
## Policy_Deductible " " " "
## PolicyAnnualPremium " " " "
## UmbrellaLimit " " " "
## InsuredAge " " " "
## CapitalGains " " " "
## CapitalLoss " " " "
## NumberOfVehicles " " " "
## BodilyInjuries "1" " "
## AmountOfInjuryClaim "0" "1"
## AmountOfPropertyClaim "0" "0.6"
## AmountOfVehicleDamage "0" "0.8"
## incident_coverage "0" "0"
## daysremainingforinsurance "0" "0.1"
## AmountOfPropertyClaim AmountOfVehicleDamage
## CustomerLoyaltyPeriod " " " "
## Policy_Deductible " " " "
## PolicyAnnualPremium " " " "
## UmbrellaLimit " " " "
## InsuredAge " " " "
## CapitalGains " " " "
## CapitalLoss " " " "
## NumberOfVehicles " " " "
## BodilyInjuries " " " "
## AmountOfInjuryClaim " " " "
## AmountOfPropertyClaim "1" " "
## AmountOfVehicleDamage "0.8" "1"
## incident_coverage "0" "0"
## daysremainingforinsurance "0.1" "0.1"
## incident_coverage daysremainingforinsurance
## CustomerLoyaltyPeriod " " " "
## Policy_Deductible " " " "
## PolicyAnnualPremium " " " "
## UmbrellaLimit " " " "
## InsuredAge " " " "
## CapitalGains " " " "
## CapitalLoss " " " "
## NumberOfVehicles " " " "
## BodilyInjuries " " " "
## AmountOfInjuryClaim " " " "
## AmountOfPropertyClaim " " " "
## AmountOfVehicleDamage " " " "
## incident_coverage "1" " "
## daysremainingforinsurance "-0.6" "1"
###Visualization
###Visualizing all the variables V/s the target variable
for(i in 1:14){
par(mfrow=c(2,2))
print(ggplot(cbind(numeric_data,final$ReportedFraud), aes_string(x = colnames(numeric_data)[i], fill = "final$ReportedFraud")) +
geom_density(alpha = 0.5) +
labs(title = colnames(numeric_data)[i],
x = paste("x",i,sep=""),
y = "Density",
col = "ReportedFraud"))
}
library(scorecard)
data = final[,setdiff(names(final),c("InsuredZipCode","IncidentAddress","AmountOfTotalClaim"))]
woe_bins = scorecard::woebin(data, y="ReportedFraud", method="tree",positive = "Y",bin_num_limit = 4)
## [INFO] creating woe binning ...
## Warning in check_datetime_cols(dt): There were 2 date/time columns removed from input dataset,
## DateOfPolicyCoverage, DateOfIncident
data_binned = scorecard::woebin_ply(data,woe_bins)
## [INFO] converting into woe values ...
plots_demog=scorecard::woebin_plot(woe_bins, x = NULL, title = NULL, show_iv = TRUE)
par(mfrow=c(2,2))
plots_demog
## $VehicleAttributeDetails3
##
## $VehicleAttributeDetails2
##
## $VehicleAttributeDetails4
##
## $CustomerLoyaltyPeriod
##
## $InsurancePolicyState
##
## $Policy_CombinedSingleLimit
##
## $Policy_Deductible
##
## $PolicyAnnualPremium
##
## $UmbrellaLimit
##
## $InsuredRelationship
##
## $InsuredAge
##
## $InsuredGender
##
## $InsuredEducationLevel
##
## $InsuredOccupation
##
## $InsuredHobbies
##
## $CapitalGains
##
## $CapitalLoss
##
## $TypeOfIncident
##
## $TypeOfCollission
##
## $SeverityOfIncident
##
## $AuthoritiesContacted
##
## $IncidentState
##
## $IncidentCity
##
## $IncidentTime
##
## $NumberOfVehicles
##
## $PropertyDamage
##
## $BodilyInjuries
##
## $Witnesses
##
## $PoliceReport
##
## $AmountOfInjuryClaim
##
## $AmountOfPropertyClaim
##
## $AmountOfVehicleDamage
##
## $incident_coverage
##
## $daysremainingforinsurance
set.seed(1234)
final$ReportedFraud = as.factor(final$ReportedFraud)
Train_ID=caret::createDataPartition(final$ReportedFraud,p=0.8,list = F)
Train_data = final[Train_ID,]
Test_data = final[-Train_ID,]
Train_data_binned = data[Train_ID,]
Test_data_binned = data[-Train_ID,]
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following object is masked from 'package:arules':
##
## %in%
## The following objects are masked from 'package:data.table':
##
## hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 17 minutes 28 seconds
## H2O cluster timezone: America/Chicago
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.0.1
## H2O cluster version age: 5 months and 6 days !!!
## H2O cluster name: H2O_started_from_R_siddarthbalasubramani_yav906
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.92 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.4 (2021-02-15)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (5 months and 6 days)!
## Please download and install the latest version from http://h2o.ai/download/
finaldata.h2o=as.h2o(Train_data)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
finaltest.h2o=as.h2o(Test_data)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
y.dep<-1
x.indep<-2:40
xgboost.model = h2o.xgboost(x = x.indep,
y = y.dep,
training_frame = finaldata.h2o,
validation_frame = finaltest.h2o,
booster = "gbtree",
seed = 1234,
nfolds = 5,
distribution="bernoulli",
eta = 0.151,
max_depth = 10,
sample_rate = 1,
col_sample_rate=1)
## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [InsuredHobbies, IncidentState, InsurancePolicyState, SeverityOfIncident, InsuredRelationship, Policy_CombinedSingleLimit, InsuredEducationLevel, IncidentAddress, IncidentCity, TypeOfIncident, AuthoritiesContacted, AmountOfTotalClaim, InsuredOccupation].
##
|
| | 0%
|
|=== | 4%
|
|======= | 9%
|
|========== | 14%
|
|============= | 18%
|
|=================== | 27%
|
|======================= | 33%
|
|=========================== | 39%
|
|=============================== | 44%
|
|================================== | 49%
|
|===================================== | 53%
|
|============================================ | 63%
|
|=============================================== | 68%
|
|================================================= | 71%
|
|=================================================== | 73%
|
|===================================================== | 76%
|
|======================================================= | 79%
|
|========================================================== | 83%
|
|============================================================ | 85%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|=================================================================== | 96%
|
|======================================================================| 100%
h2o.auc(h2o.performance(xgboost.model))
## [1] 0.978399
perf <- h2o.performance(xgboost.model, finaltest.h2o)
perf
## H2OBinomialMetrics: xgboost
##
## MSE: 0.08766438
## RMSE: 0.2960817
## LogLoss: 0.3164162
## Mean Per-Class Error: 0.1230924
## AUC: 0.9042465
## AUCPR: 0.8592338
## Gini: 0.8084931
## R^2: 0.5552127
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N Y Error Rate
## N 4028 182 0.043230 =182/4210
## Y 316 1241 0.202954 =316/1557
## Totals 4344 1423 0.086353 =498/5767
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.346662 0.832886 191
## 2 max f2 0.283030 0.820686 215
## 3 max f0point5 0.439468 0.876628 163
## 4 max accuracy 0.397878 0.914514 175
## 5 max precision 0.980722 1.000000 0
## 6 max recall 0.018563 1.000000 397
## 7 max specificity 0.980722 1.000000 0
## 8 max absolute_mcc 0.374774 0.777417 182
## 9 max min_per_class_accuracy 0.224053 0.849711 242
## 10 max mean_per_class_accuracy 0.311276 0.878571 204
## 11 max tns 0.980722 4210.000000 0
## 12 max fns 0.980722 1555.000000 0
## 13 max fps 0.014146 4210.000000 399
## 14 max tps 0.018563 1557.000000 397
## 15 max tnr 0.980722 1.000000 0
## 16 max fnr 0.980722 0.998715 0
## 17 max fpr 0.014146 1.000000 399
## 18 max tpr 0.018563 1.000000 397
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
h2o.varimp(xgboost.model)
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 AmountOfVehicleDamage 5327.692383 1.000000 0.093605
## 2 AmountOfInjuryClaim 3475.890625 0.652420 0.061070
## 3 AmountOfPropertyClaim 3307.105469 0.620739 0.058104
## 4 DateOfIncident 3211.359863 0.602768 0.056422
## 5 CapitalGains 2909.274658 0.546067 0.051115
##
## ---
## variable relative_importance scaled_importance
## 189 InsuredZipCode.453 5.791574 0.001087
## 190 IncidentTime.10 4.459343 0.000837
## 191 VehicleAttributeDetails3.Accord 3.391335 0.000637
## 192 InsuredZipCode.447 0.374069 0.000070
## 193 InsuredZipCode.610 0.238735 0.000045
## 194 InsuredZipCode.430 0.054365 0.000010
## percentage
## 189 0.000102
## 190 0.000078
## 191 0.000060
## 192 0.000007
## 193 0.000004
## 194 0.000001
glm.model <- h2o.glm(y=y.dep, x=x.indep, training_frame = finaldata.h2o, family = "binomial",nfolds=5)
## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [InsuredHobbies, IncidentState, InsurancePolicyState, SeverityOfIncident, InsuredRelationship, Policy_CombinedSingleLimit, InsuredEducationLevel, IncidentAddress, IncidentCity, TypeOfIncident, AuthoritiesContacted, AmountOfTotalClaim, InsuredOccupation].
##
|
| | 0%
|
|======================================================================| 100%
h2o.auc(h2o.performance(glm.model))
## [1] 0.7561092
#predict.dl2 <- as.data.frame(h2o.predict(gbm.model, finaltest.h2o))
perf <- h2o.performance(glm.model, finaltest.h2o)
perf
## H2OBinomialMetrics: glm
##
## MSE: 0.1680515
## RMSE: 0.4099409
## LogLoss: 0.5101557
## Mean Per-Class Error: 0.3186181
## AUC: 0.7408383
## AUCPR: 0.5101469
## Gini: 0.4816767
## R^2: 0.1473484
## Residual Deviance: 5884.136
## AIC: 6246.136
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N Y Error Rate
## N 3228 982 0.233254 =982/4210
## Y 629 928 0.403982 =629/1557
## Totals 3857 1910 0.279348 =1611/5767
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.328587 0.535333 185
## 2 max f2 0.150035 0.679771 306
## 3 max f0point5 0.368664 0.520339 163
## 4 max accuracy 0.459682 0.754118 117
## 5 max precision 0.898059 1.000000 0
## 6 max recall 0.017053 1.000000 398
## 7 max specificity 0.898059 1.000000 0
## 8 max absolute_mcc 0.360930 0.345253 167
## 9 max min_per_class_accuracy 0.275199 0.677197 218
## 10 max mean_per_class_accuracy 0.328587 0.681382 185
## 11 max tns 0.898059 4210.000000 0
## 12 max fns 0.898059 1556.000000 0
## 13 max fps 0.012936 4210.000000 399
## 14 max tps 0.017053 1557.000000 398
## 15 max tnr 0.898059 1.000000 0
## 16 max fnr 0.898059 0.999358 0
## 17 max fpr 0.012936 1.000000 399
## 18 max tpr 0.017053 1.000000 398
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`